# Assumes that the following variables are present:
#   d (Chris's full data: the right (data) side of the main table)
#   ind (indices and stage/box markup: the first 3 (left) columns of the main table)
#   scores (components c1 and c2, with indices in the first column; tab "full scores" from file "clustering")

require(mi);
require(ggplot2);
# require(isopam);
require(lle);

# Choose which rows of the main table go into the imputation and clustering.
# Alternative selections, kept for reference:
# g <- (ind$boxed == 0 & ind$stage >= 48)  # Only older, only naive
# g <- (ind$boxed == 0 & ind$stage >= 1)   # All naive
g <- ind$stage >= 1  # All. For clustering analysis ALL is much better.

# Index explicitly rather than calling subset(d, g): subset() evaluates its
# condition inside the data frame first, so a column that happened to be named
# "g" would silently take the place of this mask. NA entries of the mask are
# dropped, which is what subset() does as well.
keep <- g & !is.na(g)
d2 <- d[keep, , drop = FALSE]
ind2 <- ind[keep, , drop = FALSE]

# Wrap the selected data for the mi package and set the type of the listed
# variables to "pos" before fitting the imputation models.
mdf <- missing_data.frame(d2)
mdf <- change(
  mdf,
  y = c("wavetau", "synisi", "q", "ppr", "poly", "minif", "minii", "jitter"),
  what = "type",
  to = "pos"
)

# The long one (takes some time).
# n.iter = 20, n.chains = 4, max.minutes = 5 is a combination known to work.
imputations <- mi(mdf, n.iter = 20, n.chains = 4, max.minutes = 5,
                  parallel = FALSE, verbose = FALSE)

m <- 50  # How many imputations
k <- complete(imputations, m = m)

# Keep only the first 33 columns of every completed data set: the columns
# after them are the diagnostic part (TRUE/FALSE flags) appended at the end.
# seq_along() keeps the loop safe even if k were empty.
for (q in seq_along(k)) {
  k[[q]] <- k[[q]][1:33]
}

# One number per imputed data set: the sum over columns of sd / mean.
# It is used below to see how many of the imputations actually differ and how
# they are distributed. Computed in one pass with vapply() instead of growing
# a vector element by element inside a loop.
diagnostic <- vapply(
  k,
  function(imp) sum(apply(imp, 2, sd) / apply(imp, 2, mean)),
  numeric(1),
  USE.NAMES = FALSE
)
length(unique(diagnostic))  # Number of distinct diagnostic values
qplot(diagnostic, binwidth = 0.1)
# qplot(seq_along(k), diagnostic)

choice <- 9     # Which of the imputed data sets in k to cluster
nClusters <- 5  # Number of clusters to cut the tree into

# hc <- hclust(dist(comps[c(2, 3)], method = "manhattan"), "ward.D2")  # Test (simple 2d cluster)
# Hierarchical clustering (Ward's method) of the chosen imputed data set.
# "manhattan" or "euclidean" are the alternatives tried for dist().
hc <- hclust(dist(k[[choice]], method = "euclidean"), "ward.D2")
plot(hc)
rect.hclust(hc, k = nClusters)  # Outline the nClusters clusters on the tree

ct <- cutree(hc, nClusters)  # Cluster label for every row
# write.table(data.frame(ind2$id, ct), "r_cluster_output.txt", sep = "\t", row.names = FALSE)

# Put the markup, the matching rows of scores and the cluster labels side by
# side. NOTE(review): this assumes the selected rows of scores come in the same
# order as the rows of ind2 - confirm.
img <- data.frame(ind2, subset(scores, scores[[1]] %in% ind2[[1]]), ct)
img[[4]] <- NULL  # Delete repeated column
# size is a fixed setting, so it belongs outside aes(); inside aes() it was
# treated as a mapped variable (rescaled, with an extra legend entry).
ggplot(data = img, aes(c1, c2)) +
  geom_point(aes(color = factor(ct)), size = 7) +
  theme_bw()



### ---------------- Amount of clustering

# Pick the group of rows to analyze. The mask is built from ind2 (not ind)
# because it is applied to the imputed data sets in k, which were produced
# from d2 and therefore line up with ind2; a mask over the full ind table is
# misaligned whenever the earlier selection dropped any rows.
# g <- ind2$boxed > -1  # all
# g <- (ind2$boxed == 0 & ind2$stage %in% c(42, 43, 44))  # The only one that wouldn't work. n=11
g <- (ind2$boxed == 0 & ind2$stage %in% c(45, 46))  # n=64
# g <- (ind2$boxed == 0 & ind2$stage %in% c(47))  # n=24
# g <- (ind2$boxed == 0 & ind2$stage %in% c(48, 49))  # n=56
# g <- (ind2$boxed == 1 & ind2$stage %in% c(48, 49))  # n=60
print(sum(g))

tosample <- 20  # Rows drawn per random subsample (sample() fails on smaller groups)
nSamples <- 10  # Random subsamples drawn from every imputed data set
bag <- numeric(m * nSamples)  # Preallocated: one value per subsample
counter <- 0
# The loop variable is deliberately NOT called `choice`: reusing that name
# overwrote the imputation picked for clustering, so any later code reading
# `choice` silently switched to the last imputation.
for (iChoice in seq_len(m)) {
  dataN <- data.frame(scale(k[[iChoice]]))[g, ]
  for (iSample in seq_len(nSamples)) {
    temp <- dataN[sample(nrow(dataN), tosample), ]  # Sample some rows at random
    counter <- counter + 1
    # a = agnes(dist(temp, method="manhattan")); bag[counter] = a$ac; # Amount of clustering
    # p = princomp(temp); bag[counter] = sum(p$sd[1:2]^2)/sum(p$sd^2); # Var explained by local PCA
    p <- dist(temp, method = "manhattan")
    bag[counter] <- median(as.vector(p))  # Size of the cluster
  }
}
qplot(bag) + theme_classic()
print(mean(bag))
print(sd(bag))
# write.table(bag, row.names = FALSE)

### ----------------- Below: A competition of dimensionality reduction methods
# Reference geometry: standardize the imputed data set (scale() centers and
# scales by default) and take all pairwise Manhattan distances between rows.
# NOTE(review): `choice` is whatever value it holds when this line runs; make
# sure it is the same imputation the cluster labels in ct were computed from.
dataN <- data.frame(scale(k[[choice]]))
mydist <- dist(dataN, method = "manhattan")

### PCA on imputed data (linear method)
p <- princomp(dataN)
# qplot(p$scores[, 1], p$scores[, 2]) + theme_bw()
# Distances in the plane of the first two components, compared with the
# reference distances: scatter plot, then squared correlation.
newdist <- dist(p$scores[, 1:2], method = "euclidean")
qplot(as.vector(mydist), as.vector(newdist)) + theme_bw()
cor(as.vector(mydist), as.vector(newdist))^2

### Multidimensional Scaling (linear method)
# Classical MDS of the reference distances into 2 dimensions.
r <- cmdscale(mydist, k = 2, eig = TRUE)
img <- data.frame(x = r$points[, 1], y = r$points[, 2], ct = ct)
# ggplot(data = img, aes(x, y)) + geom_point(aes(color = factor(ct), size = 7)) + theme_bw()
# How well do distances in the 2D embedding reproduce the reference ones?
newdist <- dist(r$points, method = "euclidean")
qplot(as.vector(mydist), as.vector(newdist)) + theme_bw()
cor(as.vector(mydist), as.vector(newdist))^2

# Isomapping
# NOTE(review): none of the packages loaded at the top of this file (mi,
# ggplot2, lle) appears to provide isomap(); the call and the $points / $eig
# fields look like vegan::isomap - confirm that vegan is attached before
# running this section.
i <- isomap(mydist, ndim = 2, k = 20)
# sum(i$eig[1:2]) / sum(i$eig)
img <- data.frame(x = i$points[, 1], y = i$points[, 2], ct = ct)
# size is a fixed setting, so it belongs outside aes(); inside aes() it was
# treated as a mapped variable (rescaled, with an extra legend entry).
ggplot(data = img, aes(x, y)) +
  geom_point(aes(color = factor(ct)), size = 7) +
  theme_bw()
# Compare distances in the 2D embedding with the reference distances.
newdist <- dist(i$points, method = "euclidean")
qplot(as.vector(mydist), as.vector(newdist)) + theme_bw()
cor(as.vector(mydist), as.vector(newdist))^2

# Local Linear Embedding
# calc_k(dataN, 2, kmin = 1, kmax = 20, plotres = TRUE, parallel = FALSE, cpus = 2, iLLE = FALSE)  # The answer is 19
r <- lle(dataN, m = 2, k = 19)  # 2 output dimensions, 19 neighbours
img <- data.frame(x = r$Y[, 1], y = r$Y[, 2], ct = ct)
# size is a fixed setting, so it belongs outside aes(); inside aes() it was
# treated as a mapped variable (rescaled, with an extra legend entry).
ggplot(data = img, aes(x, y)) +
  geom_point(aes(color = factor(ct)), size = 7) +
  theme_bw()
# Compare distances in the 2D embedding with the reference distances.
newdist <- dist(r$Y, method = "euclidean")
qplot(as.vector(mydist), as.vector(newdist)) + theme_bw()
cor(as.vector(mydist), as.vector(newdist))^2

### Our Bayesian PCA coeffs. Not quite relevant, as all the rest use imputed data
# Rows of scores whose index (first column) occurs in the first column of ind2.
# NOTE(review): the comparison below assumes these rows come in the same order
# as the rows behind mydist - confirm.
tempscores <- subset(scores, scores[[1]] %in% ind2[[1]])
# Columns 2 and 3 of the selected scores are used as the 2D coordinates.
newdist <- dist(tempscores[2:3], method = "euclidean")
# qplot(tempscores[[2]], tempscores[[3]]) + theme_bw()
qplot(as.vector(mydist), as.vector(newdist)) + theme_bw()
cor(as.vector(mydist), as.vector(newdist))^2